In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
# import fiona
from math import radians, cos, sin, asin, sqrt 
import pandas as pd
import numpy as np
import os
from engarde.decorators import none_missing, unique_index, is_shape
import engarde.generic
import engarde.decorators as ed
from pyproj import Geod
# for plotting map Leaflet library
import folium
from folium import plugins
import scipy
from itertools import cycle
import math
from scipy.spatial import distance
import timeit
import time
import datetime
import sys
from IPython.display import display
import matplotlib.colors as colorscale
#for plotting barcharts and histograms and line chart 
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy import signal
import scipy.stats as stats
import pylab as pl
import plotly.plotly as py
import plotly.graph_objs as go
import geojson as jj
from geojson import Feature, Point, FeatureCollection
import colorsys
import random
from pykalman import KalmanFilter
from scipy import stats 
plt.rcParams['figure.figsize'] = 13,8
import matplotlib.ticker as mtick
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')
import plotly as pyPlot
pyPlot.tools.set_credentials_file(username='ladhar.ravishankar', api_key='uzbvUCS23VktDu0VC3l0')
pyPlot.tools.set_config_file(world_readable=True, sharing='public')
import json
In [3]:
#start_time = timeit.default_timer()
result_frame = pq.read_table('/bigdata0/rshankar/Zone18_2011/02/Broadcast.parquet').to_pandas()
result_frame = result_frame.sort_values(['MMSI', 'BaseDateTime'])

result_frame['CheckMMSI'] = (result_frame['MMSI'] != result_frame['MMSI'].shift(1))


#shifting the whole dataframe rows and calculating radians latitude
result_frame['latRad'] = np.where(result_frame['CheckMMSI'] != True , result_frame['lat'].shift(1).apply(radians), np.nan)
result_frame['lonRad'] = np.where(result_frame['CheckMMSI'] != True , result_frame['lon'].shift(1).apply(radians), np.nan)

#taking radinas for the small values
lat1 = result_frame['lat'].apply(radians)
lon1 = result_frame['lon'].apply(radians)

#taking difference for latitude and longitude
dlon = result_frame['lonRad'] - lon1 
dlat = result_frame['latRad'] - lat1

#calculating te distance
harvesineDist = (dlat/2).apply(sin)**2 + lat1.apply(cos) * result_frame['latRad'].apply(cos) * (dlon/2).apply(sin)**2
c = 2* np.arcsin(harvesineDist.apply(sqrt))
dist = 6367 * c

# c = 2 * (harvesineDist.apply(sqrt).apply(asin))
# dist =  * c

##--- Creating a new Data frame for Distance
list_dist =  pd.DataFrame({"Distance": dist})
#concatinating the Dataframe with the old Dataframe
result_frame = pd.concat([result_frame, list_dist], axis=1)
result_frame


#chaning the datetime from string to datetime stamp
result_frame['DateTime'] = pd.DatetimeIndex(result_frame['BaseDateTime'])
#df_Broadcast['DateTime']
#shfting the date on bool value and taking out the difference of two dates
result_frame['DateTime2'] = np.where(result_frame['CheckMMSI'] != True , result_frame['DateTime'].shift(1), result_frame['DateTime'])
result_frame['time_diff'] = (result_frame['DateTime'] - result_frame['DateTime2']) 
#calculating the total hours from baseDateTime
time = pd.DatetimeIndex(result_frame['time_diff'])
result_frame['totalHours'] = time.hour + (time.minute/60) + (time.second/3600)
result_frame= result_frame.reset_index()
#df_Broadcast


del result_frame['latRad']
del result_frame['lonRad']
del result_frame['DateTime2']
#del result_frame['time_diff']
del result_frame['CheckMMSI']

result_frame['Speed'] = result_frame['Distance'] / result_frame['totalHours']
result_frame = result_frame.replace([np.inf, -np.inf], np.nan)
result_frame= result_frame.replace([np.inf, -np.inf], np.nan).dropna(subset=["Speed"], how="all")


#print (timeit.default_timer() - start_time)
In [4]:
df = result_frame
In [5]:
df1 = df.set_index(pd.DatetimeIndex(df.DateTime))
variable1 = df1.groupby(['VoyageID']).rolling("5T").Speed.mean().reset_index(name= "rolling_mean").rename(columns={"VoyageID": "VoyageID2", "DateTime": "DT2"})
df = df.sort_values(['VoyageID', 'DateTime'], ascending=True).reset_index()
df = pd.concat([df, variable1], axis=1)
In [6]:
df_test = df
In [7]:
statinfo = os.stat('/bigdata0/rshankar/Zone18_2011/02/Broadcast.parquet')
statinfo
Out[7]:
os.stat_result(st_mode=33188, st_ino=2802187, st_dev=46, st_nlink=1, st_uid=65534, st_gid=4294967294, st_size=457694422, st_atime=1549136457, st_mtime=1547259449, st_ctime=1547259449)
In [8]:
df_test['Stopped']= df_test.groupby('VoyageID').apply(lambda x: x.rolling_mean.isnull()).values | df_test.groupby('VoyageID').apply(lambda x: x.rolling_mean >= 0.5).values
df_test['subVoyageIDs']= (df_test.groupby('VoyageID').apply(lambda x:x.Stopped.shift(1)) != df_test.groupby('VoyageID').apply(lambda x:x.Stopped) | (df_test.groupby('VoyageID').apply(lambda x: x.rolling_mean <= 0.5).values)).astype(int).cumsum().values
df_testResult= df_test[df_test['Stopped'] == True]
In [9]:
dataPoints = df_testResult.set_index(pd.DatetimeIndex(df_testResult.DateTime))
dataPoints= dataPoints.rename(columns={"index": "indexCol"})
In [29]:
df_CenteredWin =  dataPoints
In [30]:
datapoints= df_CenteredWin.groupby('subVoyageIDs').agg({'Speed':'count'}).reset_index().rename(columns ={'Speed':'totalPoints'})
datapoints = datapoints[datapoints.totalPoints < 10]
In [31]:
df_CenteredWin = df_CenteredWin[~(df_CenteredWin.subVoyageIDs.isin(datapoints.subVoyageIDs.unique()))]

Breaking into time data

In [32]:
df_CenteredWin['noise_timegaps']= df_CenteredWin.groupby('subVoyageIDs').apply(lambda x: (x.DateTime.diff().dt.seconds/3600 > 1)).values
df_CenteredWin['sub_subVoyageIDs']=  (df_CenteredWin.groupby('subVoyageIDs').apply(lambda x:x.noise_timegaps.shift(1)) != df_CenteredWin.groupby('subVoyageIDs').apply(lambda x:x.noise_timegaps) | (df_CenteredWin.groupby('subVoyageIDs').apply(lambda x: x.DateTime.diff().dt.seconds/3600 > 1).values)).astype(int).cumsum().values
In [33]:
total_Points = df_CenteredWin.groupby('sub_subVoyageIDs').agg({'Speed':'count'}).reset_index().rename(columns={'Speed':'totalPoints'})
total_Points = total_Points[total_Points.totalPoints > 10]

df_CenteredWin = df_CenteredWin[df_CenteredWin.sub_subVoyageIDs.isin(total_Points.sub_subVoyageIDs.unique())]

Creating new dataframe and applying Z-Score

In [34]:
dfdata = pd.DataFrame(data = { 'MMSI': df_CenteredWin.MMSI,  
                        'VoyageID' : df_CenteredWin.VoyageID,
                        'subVoyageIDs' : df_CenteredWin.subVoyageIDs,
                        'indexCol' : df_CenteredWin.indexCol,
                        'subVoyageIDs_subV' : df_CenteredWin.sub_subVoyageIDs,
                        'indexCol': df_CenteredWin.indexCol,
                        'DateTime' : df_CenteredWin.DateTime,
                        'distance' : df_CenteredWin.Distance,
                        'Speed' : df_CenteredWin.Speed,
                        'totalHours' : df_CenteredWin.totalHours,
                        'lat' : df_CenteredWin.lat,
                        'lon' :  df_CenteredWin.lon})
In [35]:
def centerMeanWin(data):
    max_timestamp= pd.Timestamp(data.DateTime.max())
    min_timestamp= pd.Timestamp(data.DateTime.min())
    period = '60T'
    reverse_df = data.set_index(max_timestamp - (data.index - min_timestamp)).sort_index()
    win_left = reverse_df.Speed.rolling(period, closed='left').sum()
    win_left= win_left.reset_index()
    win_left= win_left.set_index(pd.DatetimeIndex(win_left.DateTime))
    count_Left= win_left.Speed.rolling(period).count()
    win_left = win_left.set_index(min_timestamp + (max_timestamp - win_left.index))
    
    win_right = data.Speed.rolling(period, closed='right').sum()
    win_right= win_right.reset_index()
    win_right = win_right.set_index(pd.DatetimeIndex(win_right.DateTime))
    count_right= win_right.Speed.rolling(period).count()
    centeredMeanwin = (win_left.Speed.fillna(0) + win_right.Speed.fillna(0))/(count_Left.values + count_right.values)
    return centeredMeanwin

def centerSTDWin(data):
    max_timestamp= pd.Timestamp(data.DateTime.max())
    min_timestamp= pd.Timestamp(data.DateTime.min())
    period = '60T'
    reverse_df = data.set_index(max_timestamp - (data.index - min_timestamp)).sort_index()
    win_left = reverse_df.Speed.rolling(period, closed='left').sum()
    win_left= win_left.reset_index()
    win_left= win_left.set_index(pd.DatetimeIndex(win_left.DateTime))
    count_Left= win_left.Speed.rolling(period).count()
    win_left = win_left.set_index(min_timestamp + (max_timestamp - win_left.index))
    win_right = data.Speed.rolling(period, closed='right').sum()
    win_right= win_right.reset_index()
    win_right = win_right.set_index(pd.DatetimeIndex(win_right.DateTime))
    count_right= win_right.Speed.rolling(period).count()

    #------------- Claculation of the centered Mean value  -------------#
    centeredMeanwin = (win_left.Speed.fillna(0) + win_right.Speed.fillna(0))/(count_Left.values + count_right.values)
    #------------- End of the Claculation of the centered Mean value  -------------#

    # after taking centred mean, take the centered Standard deviation from that mean values
    #------------- Starts with the standard deviation formula -------------#
    squareVal = np.square(data.Speed - centeredMeanwin)
    sum_SquareVal = squareVal.rolling(period).sum()
    count_SquareVal = squareVal.rolling(period).count()
    centeredSTD = np.sqrt(sum_SquareVal / count_SquareVal)
    #------------- Ends the standard deviation formula -------------#
    return centeredSTD
In [36]:
dfdata['centeredMean']= dfdata.groupby('subVoyageIDs_subV').apply(lambda x: centerMeanWin(x)).values
dfdata['centeredSTD']= dfdata.groupby('subVoyageIDs_subV').apply(lambda x: centerSTDWin(x)).values
dfdata['centeredZScore'] = (dfdata.Speed - dfdata.centeredMean) / (dfdata.centeredSTD)
In [37]:
dfdata['zScore_Noise'] = np.where(dfdata['centeredZScore'] >= 1.0, 'noise', 'noisefree')

Zone Boundaries

In [20]:
dfdata['zScore_Noise'] = np.where((dfdata.lon.between(-72.011111, -72.000000)), 'outofzone', dfdata['zScore_Noise'])
dfdata['zScore_Noise'] = np.where((dfdata.lon.between(-77.999999,-77.999000)), 'outofzone_End', dfdata['zScore_Noise'])
In [21]:
len(dfdata[dfdata['zScore_Noise'] == 'outofzone'])
Out[21]:
1094
In [22]:
len(dfdata[dfdata['zScore_Noise'] == 'outofzone_End'])
Out[22]:
41
In [23]:
# dfdata= dfdata[~(dfdata['zScore_Noise'] == 'outofzone_End') | (dfdata['zScore_Noise'] == 'outofzone')] 
In [24]:
len(dfdata[dfdata['zScore_Noise'] == 'noise'])
Out[24]:
536360

Calculation Angle:

In [38]:
dfdata['lat1'] =  dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lat.shift(1)).values
dfdata['lat_1'] =  dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lat.shift(-1)).values
dfdata['lon1'] =  dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lon.shift(1)).values
dfdata['lon_1'] =  dfdata.groupby('subVoyageIDs_subV').apply(lambda x: x.lon.shift(-1)).values
In [39]:
# finding angle between three points
def getAngle(data):
    x1 = data.lat1.values
    x2 = data.lat.values
    x3 = data.lat_1.values
    
    y1 = data.lon1.values
    y2 = data.lon.values
    y3 = data.lon_1.values
    
    a = np.array(list(zip(x1, y1)))
    b = np.array(list(zip(x2, y2)))
    c = np.array(list(zip(x3, y3)))

    ba = a - b
    bc = c - b
    dotPro= [np.dot(a,b) for a,b in zip(bc, ba)]
    normA = [np.linalg.norm(a) for a in ba]
    normB = [np.linalg.norm(b) for b in bc]
    
    cosine_angle = [m/(n*o) for m, n, o in zip(dotPro, normA, normB)]
    angle = np.arccos(cosine_angle)
    angles= np.degrees(angle)
    return angles
In [40]:
dfdata['angle'] =  getAngle(dfdata)
In [41]:
df_data = dfdata

Distance distributions

In [28]:
fig, ax = plt.subplots(figsize=(16,10))
ax.scatter(dfdata.angle, dfdata['distance'])
ax.set_xlabel('angle')
ax.set_ylabel('distance')
plt.show()

Bounding Box

In [43]:
maxBounding = dfdata.groupby('subVoyageIDs_subV').agg({'lat':'max', 'lon':'max'}).reset_index()
minBounding = dfdata.groupby('subVoyageIDs_subV').agg({'lat':'min', 'lon':'min'}).reset_index()
boundingBox = maxBounding.merge(minBounding, on='subVoyageIDs_subV')
boundingBox = boundingBox.rename(columns={'lat_x':'lat_max', 'lon_x':'lon_max', 'lat_y':'lat_min', 'lon_y':'lon_min'})
In [44]:
boundingBox['distance'] = np.sqrt(np.square(boundingBox.lat_max - boundingBox.lat_min) + np.square(boundingBox.lon_max - boundingBox.lon_min)).values
In [45]:
# boundingBox[boundingBox['subVoyageIDs_subV'] == 44] 
# boundingBox[boundingBox.distance == 8.921442425626267]
In [46]:
boundingBoxSubV= boundingBox[boundingBox.distance < 0.09]
In [47]:
len(boundingBoxSubV.subVoyageIDs_subV.unique())
Out[47]:
36909
In [48]:
boundingBoxData= dfdata[(dfdata.subVoyageIDs_subV.isin(boundingBoxSubV.subVoyageIDs_subV.unique()))]
In [49]:
bbUnique = boundingBoxData.subVoyageIDs_subV.unique()
In [699]:
for idx, subVV in enumerate(falsesubV[10:60]):
    display(subVV)
    subV_short = shortsubV[shortsubV['subVoyageIDs'] == subVV]
    plt.plot(subV_short.lon, subV_short.lat)
    plt.show()
4462
4601
4745
6816
6845
7473
7671
8862
9035
9060
9061
9068
9072
9085
9092
9098
9390
9394
9441
9446
9458
9492
9495
9498
9501
9513
9515
9551
9558
9559
9573
9691
9706
9721
9734
9947
9950
9990
9991
9997
10018
10042
10047
10054
10057
10072
10074
10075
10077
10094
In [442]:
# subV = shortsubV[shortsubV['subVoyageIDs_subV'] == 10647965] 

# coordinates = list(zip(subV.lat.tolist(), subV.lon.tolist()))
# m = folium.Map(location=[41.8240, -71.4128])
# folium.PolyLine(coordinates, color = "Black").add_to(m)
# m
In [50]:
dfdata= dfdata[~(dfdata.subVoyageIDs_subV.isin(boundingBoxSubV.subVoyageIDs_subV.unique()))]
In [51]:
dataNoise = dfdata[dfdata['zScore_Noise'] == 'noise']
In [52]:
len(dataNoise)
Out[52]:
487981
In [53]:
# distanceNoise = dataNoise[dataNoise.distance.between(100, 200)]
distAngleNoise = dataNoise[dataNoise.angle.between(0 , 100)]
len(distAngleNoise)
Out[53]:
1627
In [54]:
LdistanceNoise = dataNoise[dataNoise.distance.between(10, dataNoise.distance.max())]
LdistAngleNoise = LdistanceNoise[LdistanceNoise.angle.between(0 , 180)]
len(LdistAngleNoise)
Out[54]:
462
In [55]:
largeNoise =  dataNoise[(np.isnan(dataNoise.angle) == True) & (dataNoise.Speed > 100) & (dataNoise.distance > 10)]
len(largeNoise)
Out[55]:
0
In [56]:
dfnoise = dfdata[((dfdata.Speed > 100) & (dfdata.distance > 100)) & (np.isnan(dfdata.angle) == True)]
len(dfnoise)
Out[56]:
16
In [57]:
rawData= dfdata[(dfdata.subVoyageIDs_subV.isin(distAngleNoise.subVoyageIDs_subV.unique())) | (dfdata.subVoyageIDs_subV.isin(LdistAngleNoise.subVoyageIDs_subV.unique())) | (dfdata.subVoyageIDs_subV.isin(largeNoise.subVoyageIDs_subV.unique())) | (dfdata.subVoyageIDs_subV.isin(dfnoise.subVoyageIDs_subV.unique())) ] 
In [58]:
display(len(rawData))
display(len(rawData.subVoyageIDs_subV.unique()))
640123
1002
In [59]:
totalNoise = rawData[(rawData.indexCol.isin(distAngleNoise.indexCol)) | (rawData.indexCol.isin(LdistAngleNoise.indexCol)) | (rawData.indexCol.isin(largeNoise.indexCol)) | (rawData.indexCol.isin(dfnoise.indexCol))]
In [60]:
cleanedData= rawData[~(rawData.indexCol.isin(totalNoise.indexCol))]
In [61]:
display(len(cleanedData))
display(len(cleanedData.subVoyageIDs_subV.unique()))
638042
1002
In [62]:
df_cleaned= dfdata[dfdata.indexCol.isin(cleanedData.indexCol)]
In [63]:
del df_cleaned['lat1']
del df_cleaned['lat_1']
del df_cleaned['lon1']
del df_cleaned['lon_1']
del df_cleaned['totalHours']
In [64]:
# Calculatying the new distance and the new speed after remocing the actual noise points

df_cleaned['ChecksubVID']=df_cleaned.subVoyageIDs_subV !=  df_cleaned.subVoyageIDs_subV.shift(1)
df_cleaned['latrad']= np.where(df_cleaned['ChecksubVID'] != True, df_cleaned.lat.shift(1).apply(radians), np.nan)
df_cleaned['lonrad']= np.where(df_cleaned['ChecksubVID'] != True, df_cleaned.lon.shift(1).apply(radians), np.nan)
lat1 = df_cleaned['lat'].apply(radians)
lon1 = df_cleaned['lon'].apply(radians)
dlon = df_cleaned['lonrad'] - lon1 
dlat = df_cleaned['latrad'] - lat1
harvesineDist = (dlat/2).apply(sin)**2 + lat1.apply(cos) * df_cleaned['latrad'].apply(cos) * (dlon/2).apply(sin)**2
c = 2* np.arcsin(harvesineDist.apply(sqrt))
dist = 6367 * c
df_cleaned['newdist'] = dist
# totalHours = df_cleaned.DateTime.diff().dt.seconds/3600
df_cleaned['totalHours'] =np.where(df_cleaned['ChecksubVID'] != True, df_cleaned.DateTime.diff().dt.seconds/3600, np.nan)
df_cleaned['newspeed']= df_cleaned['newdist'] / df_cleaned['totalHours']
In [65]:
del df_cleaned['latrad']
del df_cleaned['lonrad']
del df_cleaned['ChecksubVID']
del df_cleaned['totalHours']
In [66]:
# # breaking trajectories based on new distance and the new speed, hwere it seems having the large gaps of speed and distance

# df_cleaned['dist_groups']= df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 50))).values
# df_cleaned['dist_ids']=  (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups.shift(1)) != df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups) | (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 100) )).values)).astype(int).cumsum().values

df_cleaned['dist_groups']= df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 50))).values
df_cleaned['dist_ids']=  ((df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups.shift(1)) != (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x:x.dist_groups))) | (df_cleaned.groupby('subVoyageIDs_subV').apply(lambda x: ((x.newdist > 10) & (x.newspeed > 100) )).values)).astype(int).cumsum().values
In [67]:
totalPoints= df_cleaned.groupby('dist_ids').agg({'Speed':'count'}).reset_index().rename(columns={'Speed':'totalPoints'})
totalPoints= totalPoints[totalPoints.totalPoints > 10]
cleanedData= df_cleaned[df_cleaned.dist_ids.isin(totalPoints.dist_ids.unique())]
In [71]:
uniqueRawIDs = rawData.subVoyageIDs_subV.unique()
In [72]:
# Creating geometery from the Raw Data

geos = []
for idx, number in  enumerate(uniqueRawIDs):
    subV = rawData[rawData['subVoyageIDs_subV'] == number]
    geoRawData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs_subV), 'id': float(subV.subVoyageIDs_subV.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoRawData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)     
# geometry
In [73]:
json.dump(geometry, open("Zone18_2011_02_rawGeometery.geojson","w"))
In [74]:
uniqueCleanedIDs = cleanedData.dist_ids.unique()
In [75]:
# Creating geometery from the Raw Data

geos = []
for idx, number in  enumerate(uniqueCleanedIDs):
    subV = cleanedData[cleanedData['dist_ids'] == number]
    geoCleanedData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs_subV), 'id': float(subV.subVoyageIDs_subV.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoCleanedData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)     
# geometry
In [76]:
json.dump(geometry, open("Zone18_2011_02_cleanedGeometery.geojson","w"))
In [78]:
# creating the geometery from thr rawData
uniqueRawIDs = dfdata.subVoyageIDs_subV.unique()

geos = []
for idx, number in  enumerate(uniqueRawIDs):
    subV = dfdata[dfdata['subVoyageIDs_subV'] == number]
    geoLDISTData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs_subV), 'id': float(subV.subVoyageIDs_subV.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoLDISTData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)
In [79]:
json.dump(geometry, open("Zone18_2011_02_rawGeometery_dfdata.geojson","w"))
In [80]:
# creating the geometery from thr rawData
geos = []
for idx, number in  enumerate(bbUnique):
    subV = boundingBoxData[boundingBoxData['subVoyageIDs_subV'] == number]
    geoLDISTData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs_subV), 'id': float(subV.subVoyageIDs_subV.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoLDISTData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)
In [81]:
json.dump(geometry, open("Zone18_2011_02_boundingboxGeometery.geojson","w"))
In [83]:
d = {'Vessels': [len(df_CenteredWin.MMSI.unique())], 'Trajectories': len(df_CenteredWin.VoyageID.unique()), 'subTrajectories': len(df_CenteredWin.subVoyageIDs.unique()), 'subTrajectories Broke on Time': len(df_data.subVoyageIDs_subV.unique()), 'Short SubTrajectories or bad Data': len(boundingBoxSubV.subVoyageIDs_subV.unique()), 
                'Remaining Data after': len(dfdata.subVoyageIDs_subV.unique()), 'Raw Trajectories': len(rawData.subVoyageIDs_subV.unique()), 
                'Cleaned Trajectories': len(cleanedData.subVoyageIDs_subV.unique()), 'Noise Points' : len(totalNoise)}
staticsData = pd.DataFrame(data=d)
display(staticsData)
staticsDataTable= staticsData.stack().reset_index(name = "TotalData").rename(columns={'level_1': 'label'})
del staticsDataTable['level_0']
print(staticsDataTable)
Cleaned Trajectories Noise Points Raw Trajectories Remaining Data after Short SubTrajectories or bad Data Trajectories Vessels subTrajectories subTrajectories Broke on Time
0 995 2081 1002 17346 36909 4971 1614 52500 54255
                               label  TotalData
0               Cleaned Trajectories        995
1                       Noise Points       2081
2                   Raw Trajectories       1002
3               Remaining Data after      17346
4  Short SubTrajectories or bad Data      36909
5                       Trajectories       4971
6                            Vessels       1614
7                    subTrajectories      52500
8      subTrajectories Broke on Time      54255
In [ ]:
ax = staticsDataTable.plot.bar(x='label', y='TotalData')
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [82]:
subV=dfdata[dfdata['subVoyageIDs_subV'] == 30002]
subVnoise= subV[subV.zScore_Noise == 'noise']

plt.plot(subV.Speed, '-go')
plt.show()

plt.plot(subV.distance, '-go')
plt.show()

plt.plot(subV.lon, subV.lat, '-go')
plt.plot(subVnoise.lon, subVnoise.lat, 'ro')
plt.show()
In [718]:
fig, ax = plt.subplots(figsize=(16,10))
ax.scatter(totalNoise.angle, totalNoise.distance)
ax.set_xlabel('angle')
ax.set_ylabel('distance')
plt.show()
In [99]:
dataIDs = cleanedData.subVoyageIDs.unique()
In [100]:
# Creating geometery from the Raw Data

geos = []
for idx, number in  enumerate(dataIDs):
    subV = cleanedData[cleanedData['subVoyageIDs'] == number]
    geoCleanedData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs), 'id ': float(subV.subVoyageIDs.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoCleanedData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)     
# geometry
In [101]:
json.dump(geometry, open("/bigdata0/rshankar/geofiles/cleanedGeoData/Zone18_02_geometery.geojson","w"))
In [102]:
noiseIDs = rawData.subVoyageIDs.unique()
In [103]:
# Creating geometery from the Raw Data

geos = []
for idx, number in  enumerate(noiseIDs):
    subV = rawData[rawData['subVoyageIDs'] == number]
    geoNoiseData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs), 'id ': float(subV.subVoyageIDs.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoNoiseData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)     
# geometry
In [104]:
json.dump(geometry, open("/bigdata0/rshankar/geofiles/rawGeoData/Zone18_02_rawGeometery.geojson","w"))
In [727]:
largeDistCleanedData = largeDistData.subVoyageIDs.unique()
In [728]:
# Creating geometery from the Raw Data

geos = []
for idx, number in  enumerate(largeDistCleanedData):
    subV = largeDistData[largeDistData['subVoyageIDs'] == number]
    geoLDData =  [[lon,lat] for lon,lat in zip(subV.lon ,subV.lat)]
    poly = {
            "type": "Feature",
            "properties":{"sample": len(subV.subVoyageIDs), 'id ': float(subV.subVoyageIDs.unique())},            
            "geometry" : {
            "type" : "LineString",
            "coordinates" : geoLDData,
        }
    }
    geos.append(poly)

geometry = FeatureCollection(geos)     
# geometry
In [729]:
json.dump(geometry, open("/bigdata0/rshankar/geofiles/cleanedGeoData/Zone18_02_LDistGeometery.geojson","w"))
In [105]:
# map data to show rawdata, then noise data and circles to point out the noise points.


rawData = os.path.join('/bigdata0/rshankar/geofiles/rawGeoData', 'Zone18_02_rawGeometery.geojson')
CleanedData = os.path.join('/bigdata0/rshankar/geofiles/cleanedGeoData', 'Zone18_02_geometery.geojson') 
# CleanedLDistData = os.path.join('/bigdata0/rshankar/geofiles/cleanedGeoData', 'Zone18_02_LDistGeometery.geojson') 

def style_function(feature):
    return {
        'fillOpacity': 0.2,
        'weight': 2,
        'color': 'red'
    }


def style_functionA(feature):
    return {
        'fillOpacity': 0.2,
        'weight': 2,
        'color': 'Yellow'
    }



m = folium.Map(
    location=[-59.1759, -11.6016], tiles='stamenterrain', zoom_start=5
)

folium.TileLayer('openstreetmap').add_to(m)
folium.TileLayer('StamenToner').add_to(m)
folium.TileLayer('stamenterrain').add_to(m)
folium.TileLayer('Mapbox Control Room').add_to(m)

folium.GeoJson(
    rawData,
    name='rawData',
    style_function= style_functionA
).add_to(m)

folium.GeoJson(
    CleanedData,
    name='CleanedData',
    style_function= style_function
    
).add_to(m)


# datacord = list(zip(noiseData_angle.lat, noiseData_angle.lon))
# for datacordlatlon in datacord:
#     folium.CircleMarker(datacordlatlon, radius=10,  color='green').add_to(m)


folium.LayerControl().add_to(m)
m.save('geometry_zone18_02.html')
In [561]:
angleUnique = dataNoiseRecord_distAngle.subVoyageIDs_subV.unique()
In [562]:
Angledata = dfdata[dfdata.subVoyageIDs_subV.isin(dataNoiseRecord_distAngle.subVoyageIDs_subV)]
Angledata
Out[562]:
DateTime Speed VoyageID distance indexCol lat lon subVoyageIDs subVoyageIDs_subV totalHours centeredMean centeredSTD centeredZScore zScore_Noise lat1 lat_1 lon1 lon_1 angle
DateTime
2011-02-01 11:45:59 2011-02-01 11:45:59 17.547895 2 0.287591 280465 40.768213 -74.010837 9021 25 0.016389 NaN NaN NaN noisefree NaN 40.777332 NaN -74.003108 NaN
2011-02-01 11:47:00 2011-02-01 11:47:00 53.842918 2 0.912338 280979 40.777332 -74.003108 9021 25 0.016944 490.855499 437.012581 -1.000000 noisefree 40.768213 40.783098 -74.010837 -73.998863 176.077313
2011-02-01 11:48:00 2011-02-01 11:48:00 44.015209 2 0.733587 281425 40.783098 -73.998863 9021 25 0.016667 296.839234 357.001389 -0.708188 noisefree 40.777332 40.790397 -74.003108 -73.993607 179.396653
2011-02-01 11:48:59 2011-02-01 11:48:59 39.547233 2 0.648135 281855 40.790397 -73.993607 9021 25 0.016389 212.028025 308.031171 -0.559946 noisefree 40.783098 40.797848 -73.998863 -73.988558 178.365024
2011-02-01 11:50:00 2011-02-01 11:50:00 31.246510 2 0.529455 282292 40.797848 -73.988558 9021 25 0.016944 169.563852 275.581791 -0.501910 noisefree 40.790397 40.805108 -73.993607 -73.983015 176.760949
2011-02-01 11:51:00 2011-02-01 11:51:00 55.909114 2 0.931819 282719 40.805108 -73.983015 9021 25 0.016667 139.223779 249.288038 -0.334210 noisefree 40.797848 40.812123 -73.988558 -73.977040 176.939111
2011-02-01 11:51:59 2011-02-01 11:51:59 56.593081 2 0.927498 283161 40.812123 -73.977040 9021 25 0.016389 117.804736 228.935761 -0.267375 noisefree 40.805108 40.819577 -73.983015 -73.972080 173.217799
2011-02-01 11:52:59 2011-02-01 11:52:59 55.645975 2 0.927433 283498 40.819577 -73.972080 9021 25 0.016667 102.102191 212.679395 -0.218433 noisefree 40.812123 40.825777 -73.977040 -73.969712 167.263259
2011-02-01 11:54:00 2011-02-01 11:54:00 44.137037 2 0.747878 284038 40.825777 -73.969712 9021 25 0.016944 94.730508 199.745893 -0.253289 noisefree 40.819577 40.827680 -73.972080 -73.969818 155.908201
2011-02-01 11:55:00 2011-02-01 11:55:00 12.699531 2 0.211659 284474 40.827680 -73.969818 9021 25 0.016667 87.209797 189.952963 -0.392256 noisefree 40.825777 40.827787 -73.969712 -73.969830 176.789211
2011-02-01 11:55:59 2011-02-01 11:55:59 0.041957 2 0.000688 284891 40.827787 -73.969830 9021 25 0.016389 78.904102 181.922619 -0.433493 noisefree 40.827680 40.827768 -73.969818 -73.969822 16.434698
2011-02-01 11:57:00 2011-02-01 11:57:00 0.130777 2 0.002216 285330 40.827768 -73.969822 9021 25 0.016944 75.621048 174.943412 -0.431513 noisefree 40.827787 40.823712 -73.969830 -73.970680 145.222169
2011-02-01 11:58:00 2011-02-01 11:58:00 19.870587 2 0.331176 285767 40.823712 -73.970680 9021 25 0.016667 70.190224 168.124269 -0.299300 noisefree 40.827768 40.816678 -73.969822 -73.973952 166.997780
2011-02-01 11:58:59 2011-02-01 11:58:59 37.408324 2 0.613081 286207 40.816678 -73.973952 9021 25 0.016389 64.990948 161.709626 -0.170569 noisefree 40.823712 40.814135 -73.970680 -73.975728 170.016421
2011-02-01 12:00:00 2011-02-01 12:00:00 18.863796 2 0.319637 286465 40.814135 -73.975728 9021 25 0.016944 60.530952 156.224699 -0.266713 noisefree 40.816678 40.807387 -73.973952 -73.980472 179.821871
2011-02-01 12:01:00 2011-02-01 12:01:00 50.965198 2 0.849420 286951 40.807387 -73.980472 9021 25 0.016667 56.729056 150.934735 -0.038188 noisefree 40.814135 40.800975 -73.975728 -73.985320 178.015822
2011-02-01 12:01:59 2011-02-01 12:01:59 1.609283 2 0.026374 287381 40.800975 -73.985320 9021 25 0.016389 53.290931 146.711964 -0.352266 noisefree 40.807387 40.794682 -73.980472 -73.989685 177.653971
2011-02-01 12:03:00 2011-02-01 12:03:00 46.614413 2 0.789855 287822 40.794682 -73.989685 9021 25 0.016944 52.159085 142.337864 -0.038954 noisefree 40.800975 40.788233 -73.985320 -73.994347 178.882983
2011-02-01 12:04:00 2011-02-01 12:04:00 23.838646 2 0.397311 288246 40.788233 -73.994347 9021 25 0.016667 50.614203 138.471433 -0.193365 noisefree 40.794682 40.781585 -73.989685 -73.998477 175.986911
2011-02-01 12:04:59 2011-02-01 12:04:59 49.814717 2 0.816408 288685 40.781585 -73.998477 9021 25 0.016389 48.018603 134.778826 0.013326 noisefree 40.788233 40.775148 -73.994347 -74.002478 179.986634
2011-02-01 12:06:00 2011-02-01 12:06:00 12.465705 2 0.211224 289126 40.775148 -74.002478 9021 25 0.016944 47.111623 131.594386 -0.263278 noisefree 40.781585 40.768247 -73.998477 -74.005100 168.940486
2011-02-01 12:07:00 2011-02-01 12:07:00 47.879363 2 0.797989 289559 40.768247 -74.005100 9021 25 0.016667 45.317949 128.424187 0.019945 noisefree 40.775148 40.762453 -74.002478 -74.006388 171.728962
2011-02-01 12:08:00 2011-02-01 12:08:00 1.375533 2 0.022926 290002 40.762453 -74.006388 9021 25 0.016667 43.303818 125.789541 -0.333321 noisefree 40.768247 40.760585 -74.005100 -74.004618 124.010058
2011-02-01 12:08:59 2011-02-01 12:08:59 15.590449 2 0.255510 290447 40.760585 -74.004618 9021 25 0.016389 41.461102 123.142808 -0.210087 noisefree 40.762453 40.760372 -74.006388 -74.004245 163.185283
2011-02-01 12:10:00 2011-02-01 12:10:00 2.320442 2 0.039319 290893 40.760372 -74.004245 9021 25 0.016944 40.013838 120.795328 -0.312043 noisefree 40.760585 40.760487 -74.004618 -74.005208 22.918410
2011-02-01 12:11:00 2011-02-01 12:11:00 4.923490 2 0.082058 291343 40.760487 -74.005208 9021 25 0.016667 38.768267 118.548174 -0.285494 noisefree 40.760372 40.756263 -74.004245 -74.010872 136.475871
2011-02-01 12:11:59 2011-02-01 12:11:59 9.809292 2 0.160763 291803 40.756263 -74.010872 9021 25 0.016389 37.305313 116.371053 -0.236279 noisefree 40.760487 40.749628 -74.005208 -74.013902 151.258922
2011-02-01 12:13:00 2011-02-01 12:13:00 46.043875 2 0.780188 292236 40.749628 -74.013902 9021 25 0.016944 36.209235 114.211385 0.086109 noisefree 40.756263 40.742293 -74.010872 -74.016018 171.547068
2011-02-01 12:14:00 2011-02-01 12:14:00 14.554681 2 0.242578 292672 40.742293 -74.016018 9021 25 0.016667 34.940739 112.219505 -0.181662 noisefree 40.749628 40.735565 -74.013902 -74.017352 175.123128
2011-02-01 12:14:59 2011-02-01 12:14:59 46.131240 2 0.756040 293109 40.735565 -74.017352 9021 25 0.016389 33.756308 110.291654 0.112202 noisefree 40.742293 40.729458 -74.016018 -74.019322 173.336218
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2011-02-28 04:33:00 2011-02-28 04:33:00 6.827479 14115 0.115688 16255890 37.449962 -74.462593 10688150 30259 0.016944 26.699007 10.865874 -1.828802 noisefree 37.451003 37.441092 -74.462607 -74.462398 179.511102
2011-02-28 04:34:00 2011-02-28 04:34:00 59.149793 14115 0.985830 16256559 37.441092 -74.462398 10688150 30259 0.016667 26.613979 11.699208 2.781027 noise 37.449962 37.431145 -74.462593 -74.462288 179.374186
2011-02-28 04:34:59 2011-02-28 04:34:59 67.448387 14115 1.105404 16257288 37.431145 -74.462288 10688150 30259 0.016389 26.380523 12.808558 3.206283 noise 37.441092 37.427828 -74.462398 -74.462242 179.839062
2011-02-28 04:37:00 2011-02-28 04:37:00 10.967336 14115 0.368624 16257543 37.427828 -74.462242 10688150 30259 0.033611 25.681004 13.186158 -1.115842 noisefree 37.431145 37.420150 -74.462288 -74.462168 179.757671
2011-02-28 04:38:00 2011-02-28 04:38:00 51.194617 14115 0.853244 16258093 37.420150 -74.462168 10688150 30259 0.016667 25.690527 13.565677 1.880045 noise 37.427828 37.414508 -74.462242 -74.462148 179.650908
2011-02-28 04:38:59 2011-02-28 04:38:59 38.255819 14115 0.626970 16258511 37.414508 -74.462148 10688150 30259 0.016389 25.463178 13.552259 0.943949 noisefree 37.420150 37.409195 -74.462168 -74.462103 179.717832
2011-02-28 04:40:00 2011-02-28 04:40:00 34.844523 14115 0.590421 16258950 37.409195 -74.462103 10688150 30259 0.016944 24.844263 13.545025 0.738298 noisefree 37.414508 37.403897 -74.462148 -74.462068 179.893234
2011-02-28 04:41:00 2011-02-28 04:41:00 35.324937 14115 0.588749 16259362 37.403897 -74.462068 10688150 30259 0.016667 24.862807 13.529741 0.773269 noisefree 37.409195 37.398867 -74.462103 -74.462085 179.427851
2011-02-28 04:41:59 2011-02-28 04:41:59 34.106115 14115 0.558961 16259752 37.398867 -74.462085 10688150 30259 0.016389 24.644712 13.468961 0.702460 noisefree 37.403897 37.392893 -74.462068 -74.462010 179.087080
2011-02-28 04:43:00 2011-02-28 04:43:00 39.180656 14115 0.663894 16260231 37.392893 -74.462010 10688150 30259 0.016944 23.851890 13.444322 1.140167 noisefree 37.398867 37.389265 -74.462085 -74.461980 179.754493
2011-02-28 04:44:00 2011-02-28 04:44:00 24.190237 14115 0.403171 16260532 37.389265 -74.461980 10688150 30259 0.016667 23.663568 13.321324 0.039536 noisefree 37.392893 37.382200 -74.462010 -74.461917 179.962865
2011-02-28 04:44:59 2011-02-28 04:44:59 47.905544 14115 0.785119 16261100 37.382200 -74.461917 10688150 30259 0.016389 23.457797 13.595228 1.798259 noise 37.389265 37.378492 -74.461980 -74.461887 179.952644
2011-02-28 04:46:00 2011-02-28 04:46:00 24.318321 14115 0.412060 16261423 37.378492 -74.461887 10688150 30259 0.016944 22.990270 13.459589 0.098669 noisefree 37.382200 37.371662 -74.461917 -74.461802 179.750533
2011-02-28 04:47:00 2011-02-28 04:47:00 45.541299 14115 0.759022 16261945 37.371662 -74.461802 10688150 30259 0.016667 22.749130 13.639282 1.671068 noise 37.378492 37.365953 -74.461887 -74.461723 179.920218
2011-02-28 04:47:59 2011-02-28 04:47:59 38.712303 14115 0.634452 16262399 37.365953 -74.461723 10688150 30259 0.016389 22.549576 13.687563 1.180833 noisefree 37.371662 37.360842 -74.461802 -74.461705 179.408987
2011-02-28 04:49:00 2011-02-28 04:49:00 33.519109 14115 0.567963 16262832 37.360842 -74.461705 10688150 30259 0.016944 21.924196 13.895612 0.834430 noisefree 37.365953 37.355627 -74.461723 -74.461620 179.267995
2011-02-28 04:50:00 2011-02-28 04:50:00 34.773966 14115 0.579566 16263258 37.355627 -74.461620 10688150 30259 0.016667 21.614305 14.006441 0.939543 noisefree 37.360842 37.350478 -74.461705 -74.461582 179.489050
2011-02-28 04:50:59 2011-02-28 04:50:59 34.913475 14115 0.572193 16263677 37.350478 -74.461582 10688150 30259 0.016389 21.426355 13.997496 0.963538 noisefree 37.355627 37.345168 -74.461620 -74.461458 179.085102
2011-02-28 04:51:59 2011-02-28 04:51:59 35.410561 14115 0.590176 16264099 37.345168 -74.461458 10688150 30259 0.016667 20.936574 14.128152 1.024478 noisefree 37.350478 37.340997 -74.461582 -74.461373 179.829721
2011-02-28 04:53:00 2011-02-28 04:53:00 27.357856 14115 0.463564 16264418 37.340997 -74.461373 10688150 30259 0.016944 20.688283 14.281430 0.467010 noisefree 37.345168 37.334135 -74.461458 -74.461288 179.542232
2011-02-28 04:54:00 2011-02-28 04:54:00 45.754651 14115 0.762578 16264941 37.334135 -74.461288 10688150 30259 0.016667 20.339138 14.666389 1.732909 noise 37.340997 37.329013 -74.461373 -74.461208 179.814866
2011-02-28 04:54:59 2011-02-28 04:54:59 34.732478 14115 0.569227 16265362 37.329013 -74.461208 10688150 30259 0.016389 19.988463 14.667755 1.005199 noisefree 37.334135 37.323703 -74.461288 -74.461183 179.374928
2011-02-28 04:56:00 2011-02-28 04:56:00 34.824304 14115 0.590078 16265760 37.323703 -74.461183 10688150 30259 0.016944 19.713691 14.935207 1.011744 noisefree 37.329013 37.318127 -74.461208 -74.461162 179.946031
2011-02-28 04:57:00 2011-02-28 04:57:00 37.178185 14115 0.619636 16266246 37.318127 -74.461162 10688150 30259 0.016667 19.403255 15.122906 1.175365 noisefree 37.323703 37.312807 -74.461183 -74.461112 179.677305
2011-02-28 04:57:59 2011-02-28 04:57:59 36.073350 14115 0.591202 16266654 37.312807 -74.461112 10688150 30259 0.016389 19.068717 15.157932 1.121831 noisefree 37.318127 37.304002 -74.461162 -74.461067 179.754343
2011-02-28 05:00:00 2011-02-28 05:00:00 29.111347 14115 0.978465 16267627 37.304002 -74.461067 10688150 30259 0.033611 18.957434 15.134344 0.670919 noisefree 37.312807 37.298173 -74.461112 -74.461033 179.958624
2011-02-28 05:00:59 2011-02-28 05:00:59 39.524050 14115 0.647755 16268079 37.298173 -74.461033 10688150 30259 0.016389 18.792587 15.250253 1.359418 noisefree 37.304002 37.292408 -74.461067 -74.460972 179.727968
2011-02-28 05:02:00 2011-02-28 05:02:00 37.809382 14115 0.640659 16268513 37.292408 -74.460972 10688150 30259 0.016944 18.200998 15.512716 1.264020 noisefree 37.298173 37.283910 -74.461033 -74.460953 179.521874
2011-02-28 05:03:59 2011-02-28 05:03:59 28.568351 14115 0.944343 16269165 37.283910 -74.460953 10688150 30259 0.033056 17.729819 15.579572 0.695689 noisefree 37.292408 37.275542 -74.460972 -74.460938 179.974602
2011-02-28 05:06:00 2011-02-28 05:06:00 27.666324 14115 0.929896 16269844 37.275542 -74.460938 10688150 30259 0.033611 17.207079 15.909695 0.657413 noisefree 37.283910 NaN -74.460953 NaN NaN

367696 rows × 19 columns

In [563]:
# subV = Angledata[Angledata['subVoyageIDs_subV'] == 26226]  
# subV_Noise = dataNoiseRecord_distAngle[dataNoiseRecord_distAngle['subVoyageIDs_subV'] == 26226]  

# plt.plot(subV.lon, subV.lat, '-go')
# plt.plot(subV_Noise.lon, subV_Noise.lat, 'ro')
# plt.show()
In [565]:
len(Angledata.subVoyageIDs_subV.unique())
Out[565]:
627
In [566]:
for idx, ids in enumerate(angleUnique[50:100]):
    subV = Angledata[Angledata['subVoyageIDs_subV'] == ids]
    subV_noise = dataNoiseRecord_distAngle[dataNoiseRecord_distAngle['subVoyageIDs_subV'] == ids]
    
    display(ids)
    display(subV_noise.angle)
    plt.plot(subV.lon, subV.lat, '--go')
    plt.plot(subV_noise.lon, subV_noise.lat, 'r*')
    plt.show()
1485
DateTime
2011-02-03 22:57:00    78.643524
Name: angle, dtype: float64
1489
DateTime
2011-02-07 14:19:59    50.766817
Name: angle, dtype: float64
1497
DateTime
2011-02-10 00:49:59    38.001101
Name: angle, dtype: float64
1500
DateTime
2011-02-11 00:49:59    67.468782
Name: angle, dtype: float64
1501
DateTime
2011-02-11 14:13:59    59.353279
Name: angle, dtype: float64
1504
DateTime
2011-02-14 23:10:00    13.272166
Name: angle, dtype: float64
1517
DateTime
2011-02-18 14:26:59    59.925963
Name: angle, dtype: float64
1529
DateTime
2011-02-24 14:13:59    45.619868
Name: angle, dtype: float64
1532
DateTime
2011-02-25 14:42:59    73.916922
Name: angle, dtype: float64
1653
DateTime
2011-02-04 11:18:00    3.351976
Name: angle, dtype: float64
1730
DateTime
2011-02-01 02:19:00    33.114638
Name: angle, dtype: float64
1732
DateTime
2011-02-01 18:46:00    11.413354
Name: angle, dtype: float64
1742
DateTime
2011-02-05 15:56:59    83.237343
Name: angle, dtype: float64
1779
DateTime
2011-02-26 18:43:59    36.337156
Name: angle, dtype: float64
1853
DateTime
2011-02-07 06:23:00    9.969672
Name: angle, dtype: float64
1858
DateTime
2011-02-09 04:11:00    4.284159
2011-02-09 04:14:00    5.403070
Name: angle, dtype: float64
2216
DateTime
2011-02-01 02:10:59    0.10476
Name: angle, dtype: float64
2218
DateTime
2011-02-04 11:22:00    0.587634
2011-02-05 11:13:00    0.280207
Name: angle, dtype: float64
2226
DateTime
2011-02-10 23:10:00    0.966139
Name: angle, dtype: float64
2235
DateTime
2011-02-13 11:06:59    1.027577
Name: angle, dtype: float64
2236
DateTime
2011-02-14 06:24:59    0.191045
2011-02-14 16:15:00    1.997542
2011-02-14 16:25:00    0.506818
2011-02-14 16:31:00    0.248168
Name: angle, dtype: float64
2246
DateTime
2011-02-20 22:57:59    0.274027
2011-02-21 11:28:00    0.588925
Name: angle, dtype: float64
2248
DateTime
2011-02-23 20:39:00    1.660739
2011-02-23 21:01:59    0.004809
2011-02-24 04:30:00    0.019330
Name: angle, dtype: float64
2249
DateTime
2011-02-27 01:20:00    0.606422
2011-02-27 02:48:59    3.954956
Name: angle, dtype: float64
2250
DateTime
2011-02-28 06:24:59    1.507801
Name: angle, dtype: float64
2299
DateTime
2011-02-03 03:56:59    84.936377
Name: angle, dtype: float64
2318
DateTime
2011-02-10 03:55:00    84.59106
Name: angle, dtype: float64
2342
DateTime
2011-02-18 22:54:00    86.459159
Name: angle, dtype: float64
2523
DateTime
2011-02-07 05:23:59    55.324389
2011-02-07 07:18:59    41.400057
Name: angle, dtype: float64
2530
DateTime
2011-02-12 00:04:00    84.247278
Name: angle, dtype: float64
2534
DateTime
2011-02-12 05:29:59    59.55541
Name: angle, dtype: float64
2543
DateTime
2011-02-23 07:04:00    54.864297
Name: angle, dtype: float64
2568
DateTime
2011-02-08 22:56:00    77.312579
2011-02-09 00:49:59    41.901690
Name: angle, dtype: float64
2593
DateTime
2011-02-16 12:13:00    61.970912
2011-02-16 17:52:59     6.188625
Name: angle, dtype: float64
2597
DateTime
2011-02-17 21:20:59    88.968536
Name: angle, dtype: float64
2611
DateTime
2011-02-24 14:28:00    58.44483
Name: angle, dtype: float64
2772
DateTime
2011-02-19 00:20:00    67.432042
Name: angle, dtype: float64
2784
DateTime
2011-02-25 22:59:00    85.183061
Name: angle, dtype: float64
2821
DateTime
2011-02-27 12:10:00    9.992646
2011-02-27 12:26:00    1.852791
Name: angle, dtype: float64
3115
DateTime
2011-02-21 03:27:59    12.807361
Name: angle, dtype: float64
3117
DateTime
2011-02-21 16:15:59    1.007612
Name: angle, dtype: float64
3260
DateTime
2011-02-12 14:10:59    0.112553
Name: angle, dtype: float64
3271
DateTime
2011-02-16 18:27:59    2.648088
Name: angle, dtype: float64
3285
DateTime
2011-02-23 20:52:59    0.05249
Name: angle, dtype: float64
3297
DateTime
2011-02-01 14:15:00    49.481475
2011-02-01 18:27:00    18.201962
Name: angle, dtype: float64
3299
DateTime
2011-02-02 22:53:00    42.428016
2011-02-02 22:54:59    30.763865
Name: angle, dtype: float64
3306
DateTime
2011-02-08 22:28:59     6.095714
2011-02-09 00:11:00    11.779580
Name: angle, dtype: float64
3309
DateTime
2011-02-10 20:05:00    24.040108
2011-02-10 22:21:59     2.094897
Name: angle, dtype: float64
3312
DateTime
2011-02-14 18:56:59    63.237385
2011-02-14 22:05:59     0.364330
2011-02-14 22:06:59     2.852623
2011-02-14 22:08:00     2.436923
Name: angle, dtype: float64
3321
DateTime
2011-02-23 21:30:59    16.175942
Name: angle, dtype: float64
In [292]:
subV = Angledata[Angledata['subVoyageIDs_subV'] == 8111]  

coordinates = list(zip(subV.lat.tolist(), subV.lon.tolist()))
m = folium.Map(location=[41.8240, -71.4128])
folium.PolyLine(coordinates, color = "Black").add_to(m)
m
Out[292]:
In [397]:
dataNoiseRecord_distance= dataNoiseRecord[dataNoiseRecord.distance > 1.4]
dataNoiseRecord_distAngle = dataNoiseRecord_distance[dataNoiseRecord_distance.angle.between(0, 90)]  
In [398]:
len(dataNoiseRecord_distAngle.subVoyageIDs_subV.unique())
Out[398]:
10
In [399]:
fig, ax = plt.subplots(figsize=(16,10))
ax.scatter(dataNoiseRecord_distAngle.angle, dataNoiseRecord_distAngle.distance)
ax.set_xlabel('angle')
ax.set_ylabel('distance')
plt.show()
In [400]:
angleUnique = dataNoiseRecord_distAngle.subVoyageIDs_subV.unique()
In [401]:
Angledata = dfdata[dfdata.subVoyageIDs_subV.isin(dataNoiseRecord_distAngle.subVoyageIDs_subV)]
In [402]:
for idx, ids in enumerate(angleUnique[0:50]):
    subV = Angledata[Angledata['subVoyageIDs_subV'] == ids]
    subV_noise = dataNoiseRecord_distAngle[dataNoiseRecord_distAngle['subVoyageIDs_subV'] == ids]
    
    display(ids)
    display(subV_noise.angle)
    plt.plot(subV.lon, subV.lat, '--go')
    plt.plot(subV_noise.lon, subV_noise.lat, 'r*')
    plt.show()
1384
DateTime
2011-02-22 16:24:00    1.200817
Name: angle, dtype: float64
10259
DateTime
2011-02-02 10:20:00     2.224125
2011-02-02 10:22:59     0.212730
2011-02-02 10:24:00    67.773539
Name: angle, dtype: float64
11957
DateTime
2011-02-08 17:55:59    1.399038
Name: angle, dtype: float64
20861
DateTime
2011-02-07 20:29:59    0.642394
Name: angle, dtype: float64
23033
DateTime
2011-02-12 03:43:59    1.578199
Name: angle, dtype: float64
25374
DateTime
2011-02-17 04:20:00    0.150925
Name: angle, dtype: float64
27340
DateTime
2011-02-20 23:26:59    35.091164
Name: angle, dtype: float64
29045
DateTime
2011-02-24 20:58:59    4.374954
Name: angle, dtype: float64
29675
DateTime
2011-02-26 16:21:59    2.128243
Name: angle, dtype: float64
30156
DateTime
2011-02-28 18:49:59    0.038356
Name: angle, dtype: float64
In [403]:
subV = Angledata[Angledata['subVoyageIDs_subV'] == 1384]  

coordinates = list(zip(subV.lat.tolist(), subV.lon.tolist()))
m = folium.Map(location=[41.8240, -71.4128])
folium.PolyLine(coordinates, color = "Black").add_to(m)
m
Out[403]:

Distributions

  • 0 - 90 at distance less then 2.0 is the noise -- 744 total noise points
  • 90 - 150 at distance less than 2.0 is corner points -- 726 total corner points
  • 150 - 180 at distance less than 2.0 is not a noise -- 210647 regular points
  • 0 - 90 at distance greater then 2.0 is the noise -- 6 total noise points
  • 90 - 150 at distance greater than 2.0 is corner points -- 6 corner points
  • 150 - 180 at distance greater than 2.0 are datagap points -- 325 gap points